Frame

setwd("~/Dropbox/github/art-data-science/notebook/onion")
library(rvest)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(plotly)
library(ggmap)
Google Maps API Terms of Service: http://developers.google.com/maps/terms.
Please cite ggmap if you use it: see citation('ggmap') for details.

Attaching package: ‘ggmap’

The following object is masked from ‘package:plotly’:

    wind

Acquire

pg.out <- read_html("MonthWiseMarketArrivalsNew.htm")
pg.table <- pg.out %>%
            html_node("#dnn_ctr974_MonthWiseMarketArrivals_GridView1") %>%
            html_table()
str(pg.table)
'data.frame':   3784 obs. of  7 variables:
 $ Market              : chr  "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
 $ Month Name          : chr  "January" "January" "January" "February" ...
 $ Year                : chr  "2014" "2015" "2017" "2014" ...
 $ Arrival (q)         : int  440 1305 200 1115 1115 1300 920 670 1350 940 ...
 $ Price Minimum (Rs/q): chr  "1025" "1309" "750" "831" ...
 $ Price Maximum (Rs/q): chr  "1481" "1858" "1000" "1163" ...
 $ Modal Price (Rs/q)  : chr  "1256" "1613" "850" "983" ...
df <- pg.table

Refine

df$quantity <- as.numeric(df$quantity)
df$year     <- as.numeric(df$year)
df$priceMin <- as.numeric(df$priceMin)
df$priceMax <- as.numeric(df$priceMax)
df$priceMod <- as.numeric(df$priceMod)
str(df)
'data.frame':   3783 obs. of  7 variables:
 $ market  : chr  "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
 $ month   : chr  "January" "January" "January" "February" ...
 $ year    : num  2014 2015 2017 2014 2015 ...
 $ quantity: num  440 1305 200 1115 1115 ...
 $ priceMin: num  1025 1309 750 831 1200 ...
 $ priceMax: num  1481 1858 1000 1163 1946 ...
 $ priceMod: num  1256 1613 850 983 1688 ...
df <- df %>%
      mutate(market1 = market) %>%
      separate(market1, c("city", "state"), sep = "\\(")
Too many values at 99 locations: 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, ...Too few values at 535 locations: 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, ...
df$state <- df$state %>% str_replace("\\)","")
df <- df %>%
      mutate(state = ifelse(is.na(state), market, state))
head(df)
df <- df %>%
      mutate(date = paste(month, year, sep="-"))
df$date = as.Date(paste("01-",df$date,sep=""), "%d-%B-%Y")
str(df)
'data.frame':   3783 obs. of  10 variables:
 $ market  : chr  "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
 $ month   : chr  "January" "January" "January" "February" ...
 $ year    : num  2014 2015 2017 2014 2015 ...
 $ quantity: num  440 1305 200 1115 1115 ...
 $ priceMin: num  1025 1309 750 831 1200 ...
 $ priceMax: num  1481 1858 1000 1163 1946 ...
 $ priceMod: num  1256 1613 850 983 1688 ...
 $ city    : chr  "ABOHAR" "ABOHAR" "ABOHAR" "ABOHAR" ...
 $ state   : chr  "PB" "PB" "PB" "PB" ...
 $ date    : Date, format: "2014-01-01" "2015-01-01" ...

Transform

df2016 <- df %>%
          filter(year == 2016)
head(df2016)

Split-Apply-Combine

df2016City <- df %>%
              filter(year == 2016) %>%
              group_by(city) %>%
              summarise(quantity_year = sum(quantity)) %>%
              arrange(desc(quantity_year)) %>%
              filter(quantity_year > 2500000)
head(df2016City)

Explore

cities <- unique(df2016City$city)
cities
[1] "BANGALORE"  "MAHUVA"     "PIMPALGAON" "SOLAPUR"    "LASALGAON"  "PUNE"      
[7] "DELHI"      "NEWASA"     "MUMBAI"    
dfCity <- df %>%
          filter( city %in% cities)
dim(dfCity)  
[1] 348  10
ggplot(dfCity) + aes(date, priceMod, color=city) + geom_line()

g <- ggplot(dfCity) + aes(date, priceMod, color=city) + geom_line()
ggplotly(g)
dfCityTall <- dfCity %>%
              gather("priceType", "priceValue",5:7) %>%
              arrange(date)
ggplot(dfCityTall) + aes(date, y = priceValue, color = priceType) + geom_line() + facet_wrap(~city)

Model

dfBang <- df %>%
  filter(city == "BANGALORE") %>%
  select(date, priceMod) %>%
  arrange(date)
ggplot(dfBang) + aes(date, priceMod) + geom_line()

colnames(dfBang) <- c('ds', 'y')
str(dfBang)
'data.frame':   40 obs. of  2 variables:
 $ ds: Date, format: "2014-01-01" "2014-02-01" ...
 $ y : num  1094 797 748 712 941 ...
m <- prophet(dfBang)
Disabling weekly seasonality. Run prophet with `weekly.seasonality=TRUE` to override this.
Initial log joint probability = -3.13771
Optimization terminated normally: 
  Convergence detected: absolute parameter change was below tolerance
future <- make_future_dataframe(m, periods = 12, freq = 'm')
tail(future)
forecast <- predict(m, future)
tail(forecast[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
plot(m, forecast)

prophet_plot_components(m, forecast)

Insight

uniqcity <- unique(dfCity$city)
geo <- geocode(uniqcity)
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=BANGALORE&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=DELHI&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=LASALGAON&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=MAHUVA&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=MUMBAI&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=NEWASA&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PIMPALGAON&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PUNE&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=SOLAPUR&sensor=false
dfGeo <- bind_cols(df2016City, geo)
dfGeo
ggplot(dfGeo) + aes(lon, lat, size=quantity_year/1000) + geom_point() + coord_map()

map <- get_map("India", zoom = 5)
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
ggmap(map)

map1 <- get_map("India", maptype = "watercolor", source = "stamen", zoom = 5)
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
Map from URL : http://tile.stamen.com/watercolor/5/21/12.jpg
Map from URL : http://tile.stamen.com/watercolor/5/22/12.jpg
Map from URL : http://tile.stamen.com/watercolor/5/23/12.jpg
Map from URL : http://tile.stamen.com/watercolor/5/24/12.jpg
Map from URL : http://tile.stamen.com/watercolor/5/21/13.jpg
Map from URL : http://tile.stamen.com/watercolor/5/22/13.jpg
Map from URL : http://tile.stamen.com/watercolor/5/23/13.jpg
Map from URL : http://tile.stamen.com/watercolor/5/24/13.jpg
Map from URL : http://tile.stamen.com/watercolor/5/21/14.jpg
Map from URL : http://tile.stamen.com/watercolor/5/22/14.jpg
Map from URL : http://tile.stamen.com/watercolor/5/23/14.jpg
Map from URL : http://tile.stamen.com/watercolor/5/24/14.jpg
Map from URL : http://tile.stamen.com/watercolor/5/21/15.jpg
Map from URL : http://tile.stamen.com/watercolor/5/22/15.jpg
Map from URL : http://tile.stamen.com/watercolor/5/23/15.jpg
Map from URL : http://tile.stamen.com/watercolor/5/24/15.jpg
ggmap(map1)

ggmap(map1) + geom_point(data = dfGeo,aes(lon,lat,size=quantity_year/1000,color=city))

LS0tCnRpdGxlOiAiT25pb24gUHJpY2UgYW5kIFF1YW50aXR5IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIEZyYW1lCgpgYGB7cn0Kc2V0d2QoIn4vRHJvcGJveC9naXRodWIvYXJ0LWRhdGEtc2NpZW5jZS9ub3RlYm9vay9vbmlvbiIpCmxpYnJhcnkocnZlc3QpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkodGlkeXIpCmxpYnJhcnkoc3RyaW5ncikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHBsb3RseSkKbGlicmFyeShnZ21hcCkKYGBgCgojIEFjcXVpcmUKCmBgYHtyfQpwZy5vdXQgPC0gcmVhZF9odG1sKCJNb250aFdpc2VNYXJrZXRBcnJpdmFsc05ldy5odG0iKQpwZy50YWJsZSA8LSBwZy5vdXQgJT4lCiAgICAgICAgICAgIGh0bWxfbm9kZSgiI2Rubl9jdHI5NzRfTW9udGhXaXNlTWFya2V0QXJyaXZhbHNfR3JpZFZpZXcxIikgJT4lCiAgICAgICAgICAgIGh0bWxfdGFibGUoKQpzdHIocGcudGFibGUpCmRmIDwtIHBnLnRhYmxlCmBgYAoKIyBSZWZpbmUgCgpgYGB7cn0KZGltKGRmKQpjb2x1bW5fbmFtZXMgPC0gYygnbWFya2V0JywgJ21vbnRoJywgJ3llYXInLCAncXVhbnRpdHknLCAncHJpY2VNaW4nLCAncHJpY2VNYXgnLCAncHJpY2VNb2QnKQpjb2xuYW1lcyhkZikgPC0gY29sdW1uX25hbWVzCmhlYWQoZGYpCnRhaWwoZGYpCmRmIDwtIGRmICU+JQogICAgICBmaWx0ZXIoeWVhciAhPSAiVG90YWwiKQpkaW0oZGYpCmBgYAoKCmBgYHtyfQpkZiRxdWFudGl0eSA8LSBhcy5udW1lcmljKGRmJHF1YW50aXR5KQpkZiR5ZWFyICAgICA8LSBhcy5udW1lcmljKGRmJHllYXIpCmRmJHByaWNlTWluIDwtIGFzLm51bWVyaWMoZGYkcHJpY2VNaW4pCmRmJHByaWNlTWF4IDwtIGFzLm51bWVyaWMoZGYkcHJpY2VNYXgpCmRmJHByaWNlTW9kIDwtIGFzLm51bWVyaWMoZGYkcHJpY2VNb2QpCnN0cihkZikKYGBgCgoKYGBge3J9CmRmIDwtIGRmICU+JQogICAgICBtdXRhdGUobWFya2V0MSA9IG1hcmtldCkgJT4lCiAgICAgIHNlcGFyYXRlKG1hcmtldDEsIGMoImNpdHkiLCAic3RhdGUiKSwgc2VwID0gIlxcKCIpCmRmJHN0YXRlIDwtIGRmJHN0YXRlICU+JSBzdHJfcmVwbGFjZSgiXFwpIiwiIikKZGYgPC0gZGYgJT4lCiAgICAgIG11dGF0ZShzdGF0ZSA9IGlmZWxzZShpcy5uYShzdGF0ZSksIG1hcmtldCwgc3RhdGUpKQpoZWFkKGRmKQpgYGAgICAgIAoKYGBge3J9CmRmIDwtIGRmICU+JQogICAgICBtdXRhdGUoZGF0ZSA9IHBhc3RlKG1vbnRoLCB5ZWFyLCBzZXA9Ii0iKSkKZGYkZGF0ZSA9IGFzLkRhdGUocGFzdGUoIjAxLSIsZGYkZGF0ZSxzZXA9IiIpLCAiJWQtJUItJVkiKQpzdHIoZGYpCmBgYAogCgojIFRyYW5zZm9ybQpgYGB7cn0KZGYyMDE2IDwtIGRmICU+JQogICAgICAgICAgZmlsdGVyKHllYXIgPT0gMjAxNikKaGVhZChkZjIwMTYpCmBgYAoKU3BsaXQtQXBwbHktQ29tYmluZQpgYGB7cn0KZGYyMDE2Q2l0eSA8LSBkZiAlPiUKICAgICAgICAgICAgICBmaWx0ZXIoeWVhciA9PSAyMDE2KSAlPiUKICAgICAgICAgICAgICBncm91cF9ieShjaXR5KSAlPiUKICAgICAgICAgICAgICBzdW1tYXJpc2UocXVhbnRpdHlfeWVhciA9IHN1bShxdWFudGl0eSkpICU+JQogICAgICAgICAgICAgIGFycmFuZ2UoZGVzYyhxdWFudGl0eV95ZWFyKSkgJT4lCiAgICAgICAgICAgICAgZmlsdGVyKHF1YW50aXR5X3llYXIgPiAyNTAwMDAwKQpoZWFkKGRmMjAxNkNpdHkpCmBgYAoKIyBFeHBsb3JlIAoKYGBge3J9CmdncGxvdChkZjIwMTZDaXR5KSArIAogIGFlcyhyZW9yZGVyKGNpdHksIHF1YW50aXR5X3llYXIpLHdlaWdodCA9IHF1YW50aXR5X3llYXIsIGZpbGw9Y2l0eSkgKyAKICBnZW9tX2JhcigpICsKICBjb29yZF9mbGlwKCkKCmBgYAoKYGBge3J9CmNpdGllcyA8LSB1bmlxdWUoZGYyMDE2Q2l0eSRjaXR5KQpjaXRpZXMKZGZDaXR5IDwtIGRmICU+JQogICAgICAgICAgZmlsdGVyKCBjaXR5ICVpbiUgY2l0aWVzKQpkaW0oZGZDaXR5KSAgCmdncGxvdChkZkNpdHkpICsgYWVzKGRhdGUsIHByaWNlTW9kLCBjb2xvcj1jaXR5KSArIGdlb21fbGluZSgpCgpnIDwtIGdncGxvdChkZkNpdHkpICsgYWVzKGRhdGUsIHByaWNlTW9kLCBjb2xvcj1jaXR5KSArIGdlb21fbGluZSgpCmdncGxvdGx5KGcpCgpgYGAKCmBgYHtyfQpkZkNpdHlUYWxsIDwtIGRmQ2l0eSAlPiUKICAgICAgICAgICAgICBnYXRoZXIoInByaWNlVHlwZSIsICJwcmljZVZhbHVlIiw1OjcpICU+JQogICAgICAgICAgICAgIGFycmFuZ2UoZGF0ZSkKZ2dwbG90KGRmQ2l0eVRhbGwpICsgYWVzKGRhdGUsIHkgPSBwcmljZVZhbHVlLCBjb2xvciA9IHByaWNlVHlwZSkgKyBnZW9tX2xpbmUoKSArIGZhY2V0X3dyYXAofmNpdHkpCgpgYGAKCgojIE1vZGVsCgpgYGB7cn0KZGZCYW5nIDwtIGRmICU+JQogIGZpbHRlcihjaXR5ID09ICJCQU5HQUxPUkUiKSAlPiUKICBzZWxlY3QoZGF0ZSwgcHJpY2VNb2QpICU+JQogIGFycmFuZ2UoZGF0ZSkKCmdncGxvdChkZkJhbmcpICsgYWVzKGRhdGUsIHByaWNlTW9kKSArIGdlb21fbGluZSgpCmBgYAogIApgYGB7cn0KY29sbmFtZXMoZGZCYW5nKSA8LSBjKCdkcycsICd5JykKc3RyKGRmQmFuZykKbSA8LSBwcm9waGV0KGRmQmFuZykKCmZ1dHVyZSA8LSBtYWtlX2Z1dHVyZV9kYXRhZnJhbWUobSwgcGVyaW9kcyA9IDEyLCBmcmVxID0gJ20nKQp0YWlsKGZ1dHVyZSkKZm9yZWNhc3QgPC0gcHJlZGljdChtLCBmdXR1cmUpCnRhaWwoZm9yZWNhc3RbYygnZHMnLCAneWhhdCcsICd5aGF0X2xvd2VyJywgJ3loYXRfdXBwZXInKV0pCgpwbG90KG0sIGZvcmVjYXN0KQpgYGAKCmBgYHtyfQpwcm9waGV0X3Bsb3RfY29tcG9uZW50cyhtLCBmb3JlY2FzdCkKCmBgYAoKCiMgSW5zaWdodAoKYGBge3J9CnVuaXFjaXR5IDwtIHVuaXF1ZShkZkNpdHkkY2l0eSkKZ2VvIDwtIGdlb2NvZGUodW5pcWNpdHkpCmRmR2VvIDwtIGJpbmRfY29scyhkZjIwMTZDaXR5LCBnZW8pCmRmR2VvCmdncGxvdChkZkdlbykgKyBhZXMobG9uLCBsYXQsIHNpemU9cXVhbnRpdHlfeWVhci8xMDAwKSArIGdlb21fcG9pbnQoKSArIGNvb3JkX21hcCgpCmBgYAoKCgpgYGB7cn0KbWFwIDwtIGdldF9tYXAoIkluZGlhIiwgem9vbSA9IDUpCmdnbWFwKG1hcCkKbWFwMSA8LSBnZXRfbWFwKCJJbmRpYSIsIG1hcHR5cGUgPSAid2F0ZXJjb2xvciIsIHNvdXJjZSA9ICJzdGFtZW4iLCB6b29tID0gNSkKZ2dtYXAobWFwMSkKZ2dtYXAobWFwMSkgKyBnZW9tX3BvaW50KGRhdGEgPSBkZkdlbyxhZXMobG9uLGxhdCxzaXplPXF1YW50aXR5X3llYXIvMTAwMCxjb2xvcj1jaXR5KSkKYGBgCgo=